Разбор задач

Основы программирования на R

Антон Антонов

Logo here

Curious stats

options(stringsAsFactors = FALSE)
df <- read.csv("../course-497-submissions-full.csv")
rbind(
  total = summary(factor(df$status)),
  count_elements = summary(factor(df$status[df$step_id == 86100])),
  build_ziggurat = summary(factor(df$status[df$step_id == 86098])),
  avian = summary(factor(df$status[df$step_id == 86882])),
  binary_op = summary(factor(df$status[df$step_id == 93343]))
)
##                correct wrong
## total            29463 32983
## count_elements     340   431
## build_ziggurat     340   422
## avian              297   257
## binary_op          255   246

count_elements

count_sol <- df[df$step_id == 86100 & df$status == "correct", "reply"]
c(
  length(count_sol[str_detect(count_sol, "for")]),
  length(count_sol[str_detect(count_sol, "apply")]),
  length(count_sol[str_detect(count_sol, "table")]),
  length(count_sol)
)
## [1] 108  94 112 340
count_elements_sapply <- function(x) {
  y <- sort(unique(x)) # or unique(sort(x))?
  rbind(y, sapply(y, function(a) sum(a == x)))
}

count_elements

count_elements_table <- function(x) {
  t <- table(x)
  rbind(as.numeric(rownames(t)), t) # a bit clumsy but whatever
}
length(count_sol[!str_detect(count_sol, "table|apply|for")])
## [1] 32
count_elements_rle <- function(x) {
  a <- rle(sort(x))
  matrix(c(a$values, a$lengths), nrow = 2, byrow = T)
}

count_elements

set.seed(1825); x <- sample(1:100, 1e5, replace = TRUE) # times = 200

build_ziggurat

zig_sol <- df[df$step_id == 86098 & df$status == "correct", "reply"]
c(
  length(zig_sol), 
  length(zig_sol[str_count(zig_sol, "for") > 1]), length(zig_sol[str_count(zig_sol, "for") == 1]),
  length(zig_sol[!str_detect(zig_sol, "for|while|repeat")]), 
  length(zig_sol[str_count(zig_sol, "function") > 1])
)
## [1] 340  39 198  51  33
build_ziggurat_outer <- function(n) {
  s <- 2 * n - 1
  outer(1:s, 1:s, function(x, y) {
  x <- n - abs(n - x)
  y <- n - abs(n - y)
  pmin(x, y)
  })
}

build_ziggurat

build_ziggurat_for <- function(n) {
  w <- c(1:n, (n-1):1)
  l <- length(w)
  mat <- matrix(nrow=l, ncol=l)
  for(p in 1:l) {
    for (q in 1:l) {
      mat[p, q] <- min(w[p], w[q])
    }
  }
  if (n==1) mat <- matrix(1)
  mat
}
build_ziggurat_apply <- function(n) {
  tmp <- matrix(c(1:((n*2-1)^2)), nrow = n*2-1, ncol = n*2-1)
  return(n - apply(tmp, 1:2, function(x) max(abs(c(n,n) - which(tmp == x, arr.ind = T)))))
}

build_ziggurat

build_ziggurat_recursive1 <- function(n) {
  ziggurat <- function (m, w1, w) {
    a <- (w - w1)/2 + 1
    b <- (w + w1)/2
    m[a:b, a:b] <- m[a:b, a:b] + 1
    if (w1 > 1) {
      ziggurat(m, w1 - 2, w)
    } else m
  }
  w <- 2*n - 1
  ziggurat(matrix(0, ncol=w, nrow=w), w, w)
}
build_ziggurat_recursive2 <- function(n, level = 1) {
  m <- matrix(level, nrow = n*2 - 1, ncol = n*2 - 1)
  if (n > 1) {
    m[2:(nrow(m) - 1), 2:(ncol(m) - 1)] <- build_ziggurat_recursive2(n - 1, level + 1) 
  }
  m
}

Reduce et al. : funprog is fun

Reduce(f, v): \(l_1 = f(v_1, v_2), l_2 = f(l_1, v_3), ..., l_{n-1} = f(l_{n-2}, v_n)\)

set.seed(1961); sample_pool <- 1:100
l <- replicate(20, sample(sample_pool, 10), simplify = FALSE)
setdiff(sample_pool, Reduce(union, l))
##  [1] 18 23 44 58 61 65 67 71 72 75 80 82 94 96
build_ziggurat_reduce <- function(n) {
  stage <- function(k, n) {
    m <- matrix(0, 2*n - 1, 2*n - 1)
    ind <- k:(2*n - k)
    m[ind, ind] <- 1
    m
  }
  Reduce(`+`, lapply(1:n, function(i) stage(i, n)))
}

build_ziggurat

n <- 50 # times = 40

avian

avian <- read.csv("avianHabitat.csv")
avian %>% 
  select(Site, Observer, contains("Ht")) %>% 
  mutate(Site = factor(str_replace(Site, "[:digit:]+", ""))) %>% 
  group_by(Site, Observer) %>% 
  summarise_each(funs(sum(. > 0)))
## Source: local data frame [11 x 8]
## Groups: Site [?]
## 
##               Site Observer  DBHt   WHt   EHt   AHt   HHt   LHt
##             (fctr)   (fctr) (int) (int) (int) (int) (int) (int)
## 1       BunkerHill       JT    53    36    63     5    68    15
## 2       BunkerHill       RA    56    46    65     6    70    24
## 3       CreteCreek       RA    49    42    78     0    91    78
## 4       CreteCreek       RR    43    32    67     0    80    74
## 5      HortonCreek       JT    39    38    70     0    75    65
## 6      HortonCreek       RA    49    60   116     0   123   110
## 7      HortonCreek       RR    21    21    45     1    49    40
## 8  LivingstonCreek       RA    49   105   106     0   134    95
## 9  LivingstonCreek       RR    47   115   106     0   154    98
## 10     McAdamCreek       RA    58   114   122     0   139    85
## 11     McAdamCreek       RR    28    38    49     0    50    30

avian

avian %>% 
  select(Site, Observer, contains("Ht")) %>% 
  mutate(Site = factor(str_replace(Site, "[:digit:]+", ""))) %>% 
  gather(Species, Height, -Site, -Observer) %>% 
  group_by(Site, Observer, Species) %>% 
  summarise(Result = sum(Height > 0)) %>% 
  filter(Result > 100)
## Source: local data frame [12 x 4]
## Groups: Site, Observer [4]
## 
##               Site Observer Species Result
##             (fctr)   (fctr)   (chr)  (int)
## 1      HortonCreek       RA     EHt    116
## 2      HortonCreek       RA     HHt    123
## 3      HortonCreek       RA     LHt    110
## 4  LivingstonCreek       RA     EHt    106
## 5  LivingstonCreek       RA     HHt    134
## 6  LivingstonCreek       RA     WHt    105
## 7  LivingstonCreek       RR     EHt    106
## 8  LivingstonCreek       RR     HHt    154
## 9  LivingstonCreek       RR     WHt    115
## 10     McAdamCreek       RA     EHt    122
## 11     McAdamCreek       RA     HHt    139
## 12     McAdamCreek       RA     WHt    114

avian

avian_base <- avian
coverage_variables <- names(avian)[grepl("Ht", names(avian))]
avian_base <- avian_base[, c("Site", "Observer", coverage_variables)]
avian_base$Site <- factor(gsub("\\d", "", avian_base$Site))
avian_base <- reshape(avian_base, direction = "long", varying = coverage_variables, 
                      v.names = "Height", timevar = "Species", times = coverage_variables)
avian_base <- avian_base[, c("Site", "Observer", "Species", "Height")]
subset(
  aggregate(avian_base[, "Height", drop = F], 
          list(Site = avian_base$Site, Observer = avian_base$Observer, Species = avian_base$Species), 
          function(x) sum(x > 0)), 
  Height > 100
)
##               Site Observer Species Height
## 27     HortonCreek       RA     EHt    116
## 28 LivingstonCreek       RA     EHt    106
## 29     McAdamCreek       RA     EHt    122
## 32 LivingstonCreek       RR     EHt    106
## 38     HortonCreek       RA     HHt    123
## 39 LivingstonCreek       RA     HHt    134
## 40     McAdamCreek       RA     HHt    139
## 43 LivingstonCreek       RR     HHt    154
## 49     HortonCreek       RA     LHt    110
## 61 LivingstonCreek       RA     WHt    105
## 62     McAdamCreek       RA     WHt    114
## 65 LivingstonCreek       RR     WHt    115

binary operator

"%+for%" <- function(x, y) {
  a <- max(length(x), length(y))
  v <- rep(NA, a)
  for (i in 1:a) { # a == 0 ??
    a[i] <- x[i] + y[i]
  }
  #print(a) # ??
  a
}
bin_sol <- df[df$step_id == 93343 & df$status == "correct", "reply"]
length(bin_sol[!str_detect(bin_sol, "for|if|1:")])
## [1] 31

binary operator

"%+len%" <- function(x, y) {
  length(x) <- length(y) <- max(length(x), length(y))
  x + y
}
"%+vap%" <- function(x, y) {
  ind <- max(length(x), length(y))
  vapply(seq_len(ind), function(i) x[i] + y[i], numeric(1)) # seq.int
}

binary operator

autoplot(microbenchmark(1e3 + 1e6, 1e3 %+for% 1e6, 1e3 %+len% 1e6, 1e3 %+vap% 1e6, times = 1000))

SPb R User group


http://vk.com/spbrug




Q&A